Section I: Set-Up and Summary of the Dataset

First, we read in the data and set it up for analysis. The data is mostly clean, but we need to create a subset for calculating correlations, convert some variables to categorical and others to numeric, and parse the dates so that they are not read in as character strings.

Without doing anything, our dataset is as follows:

After cleaning, our main dataset is described below:

Our secondary dataset (used to measure correlation) is described below:

Section II: Descriptive Statistics

Our dataset has 48895 observations.

Table: Statistics summary.
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
Min Min. : 2539 Length:48895 Min. :2.44e+03 Length:48895 Bronx : 1091 Williamsburg : 3920 Min. :40.5 Min. :-74.2 Entire home/apt:25409 Min. : 0 Min. : 1 Min. : 0 Min. :2011-03-28 Min. : 0 Min. : 1 Min. : 0
Q1 1st Qu.: 9471945 Class :character 1st Qu.:7.82e+06 Class :character Brooklyn :20104 Bedford-Stuyvesant: 3714 1st Qu.:40.7 1st Qu.:-74.0 Private room :22326 1st Qu.: 69 1st Qu.: 1 1st Qu.: 1 1st Qu.:2018-07-08 1st Qu.: 0 1st Qu.: 1 1st Qu.: 0
Median Median :19677284 Mode :character Median :3.08e+07 Mode :character Manhattan :21661 Harlem : 2658 Median :40.7 Median :-74.0 Shared room : 1160 Median : 106 Median : 3 Median : 5 Median :2019-05-19 Median : 1 Median : 1 Median : 45
Mean Mean :19017143 NA Mean :6.76e+07 NA Queens : 5666 Bushwick : 2465 Mean :40.7 Mean :-74.0 NA Mean : 153 Mean : 7 Mean : 23 Mean :2018-10-04 Mean : 1 Mean : 7 Mean :113
Q3 3rd Qu.:29152178 NA 3rd Qu.:1.07e+08 NA Staten Island: 373 Upper West Side : 1971 3rd Qu.:40.8 3rd Qu.:-73.9 NA 3rd Qu.: 175 3rd Qu.: 5 3rd Qu.: 24 3rd Qu.:2019-06-23 3rd Qu.: 2 3rd Qu.: 2 3rd Qu.:227
Max Max. :36487245 NA Max. :2.74e+08 NA NA Hell’s Kitchen : 1958 Max. :40.9 Max. :-73.7 NA Max. :10000 Max. :1250 Max. :629 Max. :2019-07-08 Max. :58 Max. :327 Max. :365
NA NA NA NA NA NA (Other) :32209 NA NA NA NA NA NA NA’s :10052 NA’s :10052 NA NA

Section III: Plots and Graphs

Scatter Plot for Price and Number of Reviews

Scatter plot of price against number of reviews, plotting $\log(\text{reviews})$ to make any linear trend easier to see:

library(ggplot2)
library(ggpubr)

# Scatter plot of log(number of reviews) against nightly price, with a
# least-squares trend line and the Pearson correlation annotated on the plot.

# log(0) is -Inf, so listings with no reviews are excluded explicitly instead
# of reaching the plot layers as non-finite values.
airbnb_reviewed <- subset(airbnb, number_of_reviews > 0)

ggplot(airbnb_reviewed, aes(x = price, y = log(number_of_reviews))) +
  ggtitle("Number of Reviews vs Price Scatter Plot") +
  xlab("Price ($)") +
  # The y-axis shows the natural log of the review count, not the raw count.
  ylab("log(Number of Reviews)") +
  geom_point(size = 1, shape = 18, color = "black") +
  geom_smooth(method = "lm", se = FALSE, color = "yellow", size = 1.2) +
  theme_bw() +
  # label.x places the correlation annotation at price = 6500 on the x-axis.
  stat_cor(method = "pearson", label.x = 6500)

Box Plot for Price and Neighborhood Group

Note: outliers extend past $1,000 per night, graph truncated for visibility and interpretation.

library(ggplot2)

# Horizontal box plots of nightly price for each neighbourhood group.
ggplot(airbnb, aes(x = price, y = factor(neighbourhood_group))) +
  geom_boxplot(
    # Map fill to the group so colours follow the factor levels instead of
    # relying on a positional vector matching the number of boxes.
    aes(fill = factor(neighbourhood_group)),
    color = "black",
    show.legend = FALSE
  ) +
  scale_fill_manual(
    values = c("light green", "pink", "light blue", "yellow", "red")
  ) +
  labs(
    title = "Neighbourhood group vs Price Box plot",
    x = "Price",
    y = "Neighbourhood"
  ) +
  # Zoom the view to $0-$1,000 without discarding data: xlim() would drop
  # listings priced above 1000 before the quartiles are computed, which
  # changes the boxes themselves rather than just hiding the outliers.
  coord_cartesian(xlim = c(0, 1000))

Box Plot for Price and Room Type

Note: outliers extend past $1,000 per night, graph truncated for visibility and interpretation.

library(ggplot2)

# Horizontal box plots of nightly price for each room type.
ggplot(airbnb, aes(x = price, y = factor(room_type))) +
  geom_boxplot(
    # Map fill to the room type so colours follow the factor levels instead
    # of relying on a positional vector matching the number of boxes.
    aes(fill = factor(room_type)),
    width = 0.7,
    color = "black",
    show.legend = FALSE
  ) +
  scale_fill_manual(values = c("light green", "yellow", "light blue")) +
  labs(title = "Room type vs Price Box plot", x = "Price", y = "Room Type") +
  # Zoom the view to $0-$1,000 without discarding data: xlim() would drop
  # listings priced above 1000 before the quartiles are computed, which
  # changes the boxes themselves rather than just hiding the outliers.
  coord_cartesian(xlim = c(0, 1000))

Map of NY by Price

AirBnB Density by Neighborhood Group and Price:

library(ggmap)
library(tmaptools)
library(tidyr)

# Geocode "New York" through OpenStreetMap and reshape its bounding box into
# a one-row numeric matrix, which is the form handed to get_stamenmap().
ny_bbox <- rbind(as.numeric(paste(geocode_OSM("New York")$bbox)))

# NOTE(review): get_stamenmap() relies on Stamen-hosted map tiles; confirm
# the installed ggmap version still supports this tile source.
ny_basemap <- get_stamenmap(ny_bbox, zoom = 10)

# One semi-transparent point per listing: colour encodes the neighbourhood
# group and point size encodes the nightly price.
ggmap(ny_basemap) +
  geom_point(
    data = airbnb,
    aes(
      x = longitude,
      y = latitude,
      colour = neighbourhood_group,
      size = price
    ),
    alpha = 0.2
  )

AirBnB Average Price and Number of Listings by Neighborhood:

# Build a per-neighbourhood summary table (airbnb_map_full) holding the
# number of listings plus the mean latitude, longitude and price.

# Subset used for aggregating by mean. Columns are selected by name rather
# than by position so the code keeps working if the column order changes.
airbnb_map <- airbnb[, c("neighbourhood", "latitude", "longitude", "price")]
airbnb_map_means <- aggregate(. ~ neighbourhood, airbnb_map, mean)

# Subset used for aggregating by count: a constant 1 per listing, summed
# within each neighbourhood.
airbnb_count <- airbnb_map
airbnb_count$count <- 1
airbnb_count <- airbnb_count[, c("neighbourhood", "count")]
airbnb_counter <- aggregate(. ~ neighbourhood, airbnb_count, sum)

# Join the two summaries on the neighbourhood key instead of binding them
# side by side, so a row can never be paired with another neighbourhood's
# values. Resulting columns: neighbourhood, count, latitude, longitude, price.
airbnb_map_full <- merge(airbnb_counter, airbnb_map_means, by = "neighbourhood")

# Fail loudly if the join lost or duplicated any neighbourhood.
stopifnot(
  nrow(airbnb_map_full) == nrow(airbnb_counter),
  nrow(airbnb_map_full) == nrow(airbnb_map_means)
)
library(ggmap)
library(tmaptools)
library(tidyr)

# Geocode "New York" through OpenStreetMap and reshape its bounding box into
# a one-row numeric matrix, which is the form handed to get_stamenmap().
ny_bbox <- rbind(as.numeric(paste(geocode_OSM("New York")$bbox)))

# NOTE(review): get_stamenmap() relies on Stamen-hosted map tiles; confirm
# the installed ggmap version still supports this tile source.
ny_basemap <- get_stamenmap(ny_bbox, zoom = 10)

# One point per neighbourhood: colour encodes the mean price on a
# three-colour rainbow gradient and point size encodes the listing count.
ggmap(ny_basemap) +
  geom_point(
    data = airbnb_map_full,
    aes(x = longitude, y = latitude, colour = price, size = count),
    alpha = 0.5
  ) +
  scale_colour_gradientn(colours = rainbow(3))

Section IV: Correlation and ANOVA Tests

Correlation

Correlation Matrix for Airbnb Data

# Load the packages for this section with loadPkg() (defined outside this
# section).
loadPkg("faraway")
loadPkg("corrplot")
# Compute the pairwise correlation matrix of airbnb_cor and pass it to
# xkabledply() (defined outside this section) for display.
# NOTE(review): cor() is called without `use`, so any column containing NA
# yields NA correlations -- see the reviews_per_month row and column in the
# rendered table. The corrplot call further down uses use = "complete.obs";
# confirm whether this table should do the same.
xkabledply(cor(airbnb_cor))
Table
price minimum_nights number_of_reviews reviews_per_month calculated_host_listings_count availability_365
price 1.0000 0.0428 -0.0480 NA 0.0575 0.0818
minimum_nights 0.0428 1.0000 -0.0801 NA 0.1280 0.1443
number_of_reviews -0.0480 -0.0801 1.0000 NA -0.0724 0.1720
reviews_per_month NA NA NA 1 NA NA
calculated_host_listings_count 0.0575 0.1280 -0.0724 NA 1.0000 0.2257
availability_365 0.0818 0.1443 0.1720 NA 0.2257 1.0000
# Correlation matrix computed on complete rows only (rows with any NA are
# dropped), so every pairwise coefficient is defined.
airbnb_corplot <- cor(airbnb_cor, use = "complete.obs")

# Draw the matrix as a grid of circles.
corrplot(airbnb_corplot, method = "circle")

No variable is strongly correlated with price. However, three pairs show some evidence of weak positive correlation: minimum_nights and availability_365, number_of_reviews and availability_365, and calculated_host_listings_count and availability_365.

Correlation Between Reviews per Month and Total Reviews

# Correlation test between reviews per month and total number of reviews.
cor.test(
  x = airbnb_cor$reviews_per_month,
  y = airbnb_cor$number_of_reviews
)
## 
##  Pearson's product-moment correlation
## 
## data:  airbnb_cor$reviews_per_month and airbnb_cor$number_of_reviews
## t = 130, df = 38841, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.543 0.557
## sample estimates:
##  cor 
## 0.55

As expected, the two variables are correlated (r = 0.55), since reviews per month is a function of the total number of reviews, so we do not need to look at both.

Correlation Between Number of Reviews (Y) and Price (X)

# Correlation test between nightly price (x) and total number of reviews (y).
cor.test(
  x = airbnb$price,
  y = airbnb$number_of_reviews
)
## 
##  Pearson's product-moment correlation
## 
## data:  airbnb$price and airbnb$number_of_reviews
## t = -11, df = 48893, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0568 -0.0391
## sample estimates:
##    cor 
## -0.048

There is no evidence of a strong linear correlation (r = -0.048), but there is statistically significant evidence of a weak inverse relationship between price and number of reviews: higher-priced listings tend to have fewer reviews, possibly because they have fewer stays, for which the number of reviews is probably a good proxy.

ANOVA Tests

Testing for Differences in Price by Neighborhood Group

# One-way ANOVA: does mean price differ across neighbourhood groups?
anova_price_group <- aov(price ~ neighbourhood_group, data = airbnb)
anova_price_group

# Summarise the fitted model and display the ANOVA table.
sum_anova_price_group <- summary(anova_price_group)
xkabledply(
  sum_anova_price_group,
  title = "ANOVA result summary for Neighborhood Groups"
)

# Tukey HSD post-hoc test on the fitted model, comparing neighbourhood
# groups pairwise.
tukeyAoV_pg <- TukeyHSD(anova_price_group)
tukeyAoV_pg

Testing for Differences in Price by Room Type

# One-way ANOVA: does mean price differ across room types?
anova_price_room <- aov(price ~ room_type, data = airbnb)
anova_price_room

# Summarise the fitted model and display the ANOVA table.
sum_anova_price_room <- summary(anova_price_room)
xkabledply(
  sum_anova_price_room,
  title = "ANOVA result summary for Room Type"
)

# Tukey HSD post-hoc test on the fitted model, comparing room types pairwise.
tukeyAoV_pr <- TukeyHSD(anova_price_room)
tukeyAoV_pr